*********** User input: define parameters of the analytic sample *******

	cap program drop keep_analytic_sample
	program keep_analytic_sample
		* Restrict the data in memory to the analytic sample.
		* Requires the variables year, cz90 and age; takes no arguments.

		* Keep only years 2005-2017.
		* inrange() enforces the documented upper bound and also drops rows
		* with a missing year, which a bare year>2004 test would keep because
		* Stata orders numeric missing above every number.
		keep if inrange(year,2005,2017)
		* Keep only individual-year observations with a CZ code
		keep if !missing(cz90)
		* Keep only individual-year observations for age 20 to 70
		keep if inrange(age,20,70)
	end

************************************************************************

	use "${build_data}/panel_lawyers", clear

	* Numeric version of the person identifier, stored in id
	destring personid, gen(id)

	***
	***
	
    * Demographics 

	* Age reached in the panel year
	gen age = year - birthyr

	* Remove, in one pass, rows with unknown age, rows in or after the year
	* of death, and rows dated before the year of birth
	drop if missing(age) | deathyr<=year | birthyr>year
	
	* Indicator for gender coded "F"
	gen female = gender=="F"
	drop gender
	
	* Married indicator from filing status codes 2 and 3; left missing when
	* the filing status itself is missing
	gen married = inlist(filing_status,2,3) if !missing(filing_status)

	* For married person-years after 2004: 1 when a spouse W-2 count is present
	gen spouse_in_labor_force = !missing(count_w2s_spouse) if married==1 & year>2004


	***
	***

	* Geography 

	* Free up the names stfips for the reconciled codes built below
	rename (stfips county zcta) (m_statecode m_cnty m_zip_area)

	* Source labels found in mvar and zvar, listed in order of priority
	local info_src  "informational file"
	local indiv_src "individual file"

	* Candidate codes used repeatedly in the cascades below
	tempvar m_county_code z_state_code
	gen `m_county_code' = m_statecode + m_cnty
	gen `z_state_code'  = substr(cfips_from_zip,1,2)

	* State FIPS: fill from the first available source, never overwriting a
	* value that is already set
	gen stfips = m_statecode if mvar=="`info_src'"
	replace stfips = `z_state_code' if missing(stfips) & zvar=="`info_src'"
	replace stfips = m_statecode    if missing(stfips) & mvar=="`indiv_src'"
	replace stfips = `z_state_code' if missing(stfips) & zvar=="`indiv_src'"
	replace stfips = m_statecode    if missing(stfips) & mvar!="`info_src'" & mvar!="`indiv_src'"
	* Left-pad one-character codes to two characters
	replace stfips = "0" + stfips if length(stfips)==1

	* County FIPS: same cascade, using state+county or the zip-based county code
	gen cfips = `m_county_code' if mvar=="`info_src'"
	replace cfips = cfips_from_zip  if missing(cfips) & zvar=="`info_src'"
	replace cfips = `m_county_code' if missing(cfips) & mvar=="`indiv_src'"
	replace cfips = cfips_from_zip  if missing(cfips) & zvar=="`indiv_src'"
	replace cfips = `m_county_code' if missing(cfips) & mvar!="`info_src'" & mvar!="`indiv_src'"
	* Left-pad four-character codes to five characters
	replace cfips = "0" + cfips if length(cfips)==4

	drop `m_county_code' `z_state_code'
	
	* Five-character zip string: pad 3- and 4-character values with leading
	* zeros, blank out anything that is still not 5 characters long
	tostring zipfive, replace
	replace zipfive = substr("00",1,5-strlen(zipfive)) + zipfive if inlist(strlen(zipfive),3,4)
	replace zipfive = "" if strlen(zipfive)!=5
	
	merge m:1 zipfive using "${intermediate_data}/crosswalks/zip_to_cfips_xwalk.dta", keep(master match) nogen
	
	* Last resort: take county and state from the zip crosswalk when still empty
	replace cfips  = zcfips               if missing(cfips)  & !missing(zipfive)
	replace stfips = substr(zcfips, 1, 2) if missing(stfips) & !missing(zipfive)
		
	* Attach commuting-zone codes by county
	merge m:1 cfips using "${intermediate_data}/crosswalks/cfips_cz_state_xwalk.dta", keep(master match) nogen 
	drop cz80 	

	destring cz90, replace
	

	***
	***

	* Income
	* All dollar-denominated variables converted to 2017 dollars
	* See Appendix B.2 for explanation of income measures
	
	* Bring in the price index by year; every panel year must find a match
	fmerge m:1 year using  "${raw_data}/cps/cpi99.dta", assert(match using) keep(match) nogen 
	
	* Variables rescaled by cpi99 and the 1999-to-2017 conversion constant
	local dollar_vars aginc w2wgs_salary amount_tax_dividends amount_tax_interest ///
			social_security schde_profit_loss amount_tax_exempt_inc ///
			total_money_income tax_distr_1 tax_distr_2 w2wgs total_compensation
	foreach v of local dollar_vars {
		replace `v' = `v' * cpi99 * conversion_const1999to2017 if !missing(`v')
	}

	* Return components: a missing value is set to zero whenever aginc is present
	local return_items w2wgs_salary amount_tax_dividends amount_tax_interest ///
			social_security schde_profit_loss amount_tax_exempt_inc ///
			tax_distr_1 tax_distr_2
	foreach v of local return_items {
		replace `v' = 0 if missing(`v') & !missing(aginc)
	}

	* W-2 amounts: a missing value is set to zero for years after 2004
	local w2_items w2wgs total_compensation
	foreach v of local w2_items {
		replace `v' = 0 if missing(`v') & year>2004
	}
	
	* Filer indicator: 1 when aginc is non-missing
	gen filer_flag = !missing(aginc)
	
		* Taxable social-security income 
		*
		* Two-tier phase-in of taxable benefits:
		*   provisional income = aginc + tax-exempt income + half of benefits
		*   first tier : 50 percent of the excess over the base amount,
		*                capped at the band width and at half of benefits
		*   second tier: 85 percent of the excess over base amount + band
		*   overall cap: 85 percent of benefits
		* Base amount is 25000, plus 7000 when filing_status==2; the band is
		* 9000, plus 3000 when filing_status==2.
		* NOTE(review): filing_status==2 is presumably married filing jointly - confirm.
		*
		* Fix: the second-tier excess is floored at zero. Previously, when
		* provisional income lay between the two thresholds, the negative
		* excess was added to the first-tier amount and produced negative
		* taxable benefits.
	
	tempvar ss_prov ss_floor ss_band ss_over ss_over_band
	gen double `ss_prov'  = aginc + amount_tax_exempt_inc + social_security*.5
	gen double `ss_floor' = 25000 + 7000*(filing_status==2)
	gen double `ss_band'  = 9000 + 3000*(filing_status==2)
	gen double `ss_over'  = `ss_prov' - `ss_floor'
	* max() ignores missing arguments in Stata, so keep the excess missing
	* explicitly when provisional income is missing
	gen double `ss_over_band' = cond(missing(`ss_over'), ., max(0, `ss_over' - `ss_band'))

	gen txsocial_security = .
	replace txsocial_security = 0 if social_security == 0
	replace txsocial_security = 0 if `ss_prov' < 0
	replace txsocial_security = 0 if `ss_over' < 0 & txsocial_security==.
	replace txsocial_security = min(min(.5*min(`ss_over', `ss_band'), social_security*.5) + .85*`ss_over_band', social_security*.85) if txsocial_security==.

	drop `ss_prov' `ss_floor' `ss_band' `ss_over' `ss_over_band'
	
		* Taxable pension distributions
		* Sum of the two distribution amounts, counted only when age exceeds 59
	
	gen txpension = (tax_distr_1 + tax_distr_2) * (age>59)
	
		* Wage earnings
	
	gen totw2comp = total_compensation 
	
		* Individual total income
		* Filers: W-2 compensation plus the non-wage part of the return.
		* Non-filers: W-2 compensation only.
		* The same formula applies to every filing status.
	
	gen ptotinc = cond(filer_flag==1, ///
			totw2comp + (aginc - w2wgs_salary - txpension + amount_tax_exempt_inc + (social_security-txsocial_security)), ///
			totw2comp)
	assert !missing(ptotinc) if year>2004
	
		* Individual business income
		* Filers: total money income net of the listed components.
		* Non-filers: zero.
		* The same formula applies to every filing status.

	gen pftloss = cond(filer_flag==1, ///
			total_money_income - w2wgs_salary - amount_tax_dividends - amount_tax_interest - social_security - schde_profit_loss - tax_distr_1 - tax_distr_2, ///
			0)
	assert !missing(pftloss) if year>2004
	
	* Derived measures of income 
	
	* Natural log of total income
	gen logptotinc = ln(ptotinc)
	

	***
	***

	* Self-employment
		
		* Self employed in ACS (at a point in time)
		* Defined only for ACS lawyers in their survey year: 1 when cow is 6 or 7

	gen se_acs = (cow==6 | cow==7) if lawyer_acs == 1 & year==year_survey
	label var se_acs "Self Employed based on ACS (measured in year of survey)"
	
		* Filed schedules C, or E, or SE
		* Largest of the three flags

	gen se_schedules = max(schedule_c_flag, schedule_e_flag, schd_se)



	***
	***

	* Add ACS variables (note: these are not a panel)
	* Both measures are defined only in the year of the ACS survey.

		* Weekly work hours 
		* Blank out hours outside the survey year, then set hours to zero
		* for survey-year rows with missing hours and esr == 6
	
	replace wkh = . if year != year_survey
	gen wkh_imputed = wkh
	replace wkh_imputed = 0 if mi(wkh) & esr == 6 & year == year_survey

		* Number of weeks worked
		* Fix: start from reported weeks in the survey year only, so that
		* wkw_imputed covers the same rows as wkh_imputed and as the
		* imputations below. Previously reported weeks were copied for
		* every year.
		* NOTE(review): the values assigned per wkw2 code look like
		* midpoints of weeks-worked brackets - confirm against the codebook.

	gen wkw_imputed = wkw if year == year_survey
	local wkw_values 51 48.5 43.5 33 20 7
	forvalues code = 1/6 {
		local weeks : word `code' of `wkw_values'
		replace wkw_imputed = `weeks' if wkw2 == `code' & missing(wkw) & year == year_survey
	}
	

	***
	***

	* Firms

	* Constant used as the summand when counting observations per firm
	gen countvar = 1

		* Define firms
		* Assign solo proprietor firm id if no EIN, but have C, E, or SE schedules
		* In that case the person identifier is used as the firm identifier

	gen ein = cond(se_schedules == 1 & mi(ein_num), personid, ein_num)
	
		* Employer size (lawyers only), time-varying
		* Number of observations sharing the same ein within a year

	gegen einsize = sum(countvar) if !mi(ein), by(ein year)
	label var einsize "Size of EIN (lawyers only) in a given year"



	***
	***
		
	 * Variable groups shared by the keep and order lists
	 local id_vars  id personid year year_survey source lawyer_acs
	 local geo_vars age age_survey female stfips cfips cz90 cz00 zipfive

	 * Variables retained in the clean panel
	 keep `id_vars' `geo_vars' ///
	      filer_flag logptotinc ptotinc aginc ///
	      wkh wkh_imputed married is_foreign spouse_in_labor_force race einsize wkw_imputed
	
	 * Column order; kept variables not listed here follow in their existing order
	 order `id_vars' `geo_vars' ///
	      filer_flag ptotinc aginc ///
	      wkh wkh_imputed married einsize wkw_imputed
	      
	gsort id year
	
	* Apply the sample restrictions defined in keep_analytic_sample
	keep_analytic_sample
	
	* Stop with an error unless id and year jointly identify each observation
	gisid id year
	
	save "${clean_data}/panel_lawyers_clean.dta", replace
